# Packages used below (corrplot, ggplot2, rpart/rpart.plot, randomForest,
# xgboost, caret); loaded up front so the script runs standalone.
library(corrplot)
library(ggplot2)
library(rpart)
library(rpart.plot)
library(randomForest)
library(xgboost)
library(caret)

# 데이터 불러오기
data(iris)

table(is.na(iris)) # frequency of NA flags: FALSE = observed value, TRUE = missing
## 
## FALSE 
##   750
colSums(is.na(iris)) # missing-value count per column (all zero here)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0
# Summarize Sepal.Length before drawing its histogram.
summary(iris$Sepal.Length)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.300   5.100   5.800   5.843   6.400   7.900
# Histogram of sepal length: explicit x-axis label (xlab), fill colour (col),
# title (main), and x-range (xlim) pinned to the observed min/max 4.3-7.9.
sepal_length <- iris$Sepal.Length
hist(sepal_length,
     xlab = "iris$Sepal.Length",
     col = "magenta",
     main = "iris 꽃 받침 길이 Histogram",
     xlim = c(4.3, 7.9))

# Same check for Sepal.Width.
summary(iris$Sepal.Width)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.800   3.000   3.057   3.300   4.400
# Histogram of sepal width; xlim spans the observed range (2.0-4.4) with headroom.
hist(iris$Sepal.Width, xlab = "iris$Sepal.Width", col = "mistyrose", 
     main = "iris 꽃받침 너비 Histogram", xlim = c(2.0, 4.5))

# Which species has the widest sepals?
# Formula notation y ~ x reads "y described by x":
# here, sepal width broken down by species.
boxplot(Sepal.Width ~ Species, data = iris)

# Compare petal length across species via the group means.
aggregate(Petal.Length ~ Species, data = iris, FUN = mean)
##      Species Petal.Length
## 1     setosa        1.462
## 2 versicolor        4.260
## 3  virginica        5.552
# Correlation analysis: quantify how strongly each pair of variables is related,
# i.e. which variables move together.
# Setosa rows only, keeping just the four numeric measurement columns.
test_s <- iris[iris$Species == "setosa", 1:4]
test_s
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1           5.1         3.5          1.4         0.2
## 2           4.9         3.0          1.4         0.2
## 3           4.7         3.2          1.3         0.2
## 4           4.6         3.1          1.5         0.2
## 5           5.0         3.6          1.4         0.2
## 6           5.4         3.9          1.7         0.4
## 7           4.6         3.4          1.4         0.3
## 8           5.0         3.4          1.5         0.2
## 9           4.4         2.9          1.4         0.2
## 10          4.9         3.1          1.5         0.1
## 11          5.4         3.7          1.5         0.2
## 12          4.8         3.4          1.6         0.2
## 13          4.8         3.0          1.4         0.1
## 14          4.3         3.0          1.1         0.1
## 15          5.8         4.0          1.2         0.2
## 16          5.7         4.4          1.5         0.4
## 17          5.4         3.9          1.3         0.4
## 18          5.1         3.5          1.4         0.3
## 19          5.7         3.8          1.7         0.3
## 20          5.1         3.8          1.5         0.3
## 21          5.4         3.4          1.7         0.2
## 22          5.1         3.7          1.5         0.4
## 23          4.6         3.6          1.0         0.2
## 24          5.1         3.3          1.7         0.5
## 25          4.8         3.4          1.9         0.2
## 26          5.0         3.0          1.6         0.2
## 27          5.0         3.4          1.6         0.4
## 28          5.2         3.5          1.5         0.2
## 29          5.2         3.4          1.4         0.2
## 30          4.7         3.2          1.6         0.2
## 31          4.8         3.1          1.6         0.2
## 32          5.4         3.4          1.5         0.4
## 33          5.2         4.1          1.5         0.1
## 34          5.5         4.2          1.4         0.2
## 35          4.9         3.1          1.5         0.2
## 36          5.0         3.2          1.2         0.2
## 37          5.5         3.5          1.3         0.2
## 38          4.9         3.6          1.4         0.1
## 39          4.4         3.0          1.3         0.2
## 40          5.1         3.4          1.5         0.2
## 41          5.0         3.5          1.3         0.3
## 42          4.5         2.3          1.3         0.3
## 43          4.4         3.2          1.3         0.2
## 44          5.0         3.5          1.6         0.6
## 45          5.1         3.8          1.9         0.4
## 46          4.8         3.0          1.4         0.3
## 47          5.1         3.8          1.6         0.2
## 48          4.6         3.2          1.4         0.2
## 49          5.3         3.7          1.5         0.2
## 50          5.0         3.3          1.4         0.2
cor(test_s) # pairwise Pearson correlations within setosa
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000   0.7425467    0.2671758   0.2780984
## Sepal.Width     0.7425467   1.0000000    0.1777000   0.2327520
## Petal.Length    0.2671758   0.1777000    1.0000000   0.3316300
## Petal.Width     0.2780984   0.2327520    0.3316300   1.0000000
plot(test_s) # scatter-plot matrix of the four measurements

corrplot(cor(test_s)) # graphical view of the correlation matrix

# For setosa, sepal length and sepal width are the clearly related pair (r = 0.74).

# Versicolor rows only, numeric columns only.
test_ver <- iris[iris$Species == "versicolor", 1:4]
test_ver
##     Sepal.Length Sepal.Width Petal.Length Petal.Width
## 51           7.0         3.2          4.7         1.4
## 52           6.4         3.2          4.5         1.5
## 53           6.9         3.1          4.9         1.5
## 54           5.5         2.3          4.0         1.3
## 55           6.5         2.8          4.6         1.5
## 56           5.7         2.8          4.5         1.3
## 57           6.3         3.3          4.7         1.6
## 58           4.9         2.4          3.3         1.0
## 59           6.6         2.9          4.6         1.3
## 60           5.2         2.7          3.9         1.4
## 61           5.0         2.0          3.5         1.0
## 62           5.9         3.0          4.2         1.5
## 63           6.0         2.2          4.0         1.0
## 64           6.1         2.9          4.7         1.4
## 65           5.6         2.9          3.6         1.3
## 66           6.7         3.1          4.4         1.4
## 67           5.6         3.0          4.5         1.5
## 68           5.8         2.7          4.1         1.0
## 69           6.2         2.2          4.5         1.5
## 70           5.6         2.5          3.9         1.1
## 71           5.9         3.2          4.8         1.8
## 72           6.1         2.8          4.0         1.3
## 73           6.3         2.5          4.9         1.5
## 74           6.1         2.8          4.7         1.2
## 75           6.4         2.9          4.3         1.3
## 76           6.6         3.0          4.4         1.4
## 77           6.8         2.8          4.8         1.4
## 78           6.7         3.0          5.0         1.7
## 79           6.0         2.9          4.5         1.5
## 80           5.7         2.6          3.5         1.0
## 81           5.5         2.4          3.8         1.1
## 82           5.5         2.4          3.7         1.0
## 83           5.8         2.7          3.9         1.2
## 84           6.0         2.7          5.1         1.6
## 85           5.4         3.0          4.5         1.5
## 86           6.0         3.4          4.5         1.6
## 87           6.7         3.1          4.7         1.5
## 88           6.3         2.3          4.4         1.3
## 89           5.6         3.0          4.1         1.3
## 90           5.5         2.5          4.0         1.3
## 91           5.5         2.6          4.4         1.2
## 92           6.1         3.0          4.6         1.4
## 93           5.8         2.6          4.0         1.2
## 94           5.0         2.3          3.3         1.0
## 95           5.6         2.7          4.2         1.3
## 96           5.7         3.0          4.2         1.2
## 97           5.7         2.9          4.2         1.3
## 98           6.2         2.9          4.3         1.3
## 99           5.1         2.5          3.0         1.1
## 100          5.7         2.8          4.1         1.3
cor(test_ver) # pairwise Pearson correlations within versicolor
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000   0.5259107    0.7540490   0.5464611
## Sepal.Width     0.5259107   1.0000000    0.5605221   0.6639987
## Petal.Length    0.7540490   0.5605221    1.0000000   0.7866681
## Petal.Width     0.5464611   0.6639987    0.7866681   1.0000000
plot(test_ver) # scatter-plot matrix

corrplot(cor(test_ver)) # graphical correlation matrix

# Strongest pairs here: sepal length ~ petal length, and petal length ~ petal width.

# Virginica rows only, numeric columns only.
test_vi <- iris[iris$Species == "virginica", 1:4]
test_vi
##     Sepal.Length Sepal.Width Petal.Length Petal.Width
## 101          6.3         3.3          6.0         2.5
## 102          5.8         2.7          5.1         1.9
## 103          7.1         3.0          5.9         2.1
## 104          6.3         2.9          5.6         1.8
## 105          6.5         3.0          5.8         2.2
## 106          7.6         3.0          6.6         2.1
## 107          4.9         2.5          4.5         1.7
## 108          7.3         2.9          6.3         1.8
## 109          6.7         2.5          5.8         1.8
## 110          7.2         3.6          6.1         2.5
## 111          6.5         3.2          5.1         2.0
## 112          6.4         2.7          5.3         1.9
## 113          6.8         3.0          5.5         2.1
## 114          5.7         2.5          5.0         2.0
## 115          5.8         2.8          5.1         2.4
## 116          6.4         3.2          5.3         2.3
## 117          6.5         3.0          5.5         1.8
## 118          7.7         3.8          6.7         2.2
## 119          7.7         2.6          6.9         2.3
## 120          6.0         2.2          5.0         1.5
## 121          6.9         3.2          5.7         2.3
## 122          5.6         2.8          4.9         2.0
## 123          7.7         2.8          6.7         2.0
## 124          6.3         2.7          4.9         1.8
## 125          6.7         3.3          5.7         2.1
## 126          7.2         3.2          6.0         1.8
## 127          6.2         2.8          4.8         1.8
## 128          6.1         3.0          4.9         1.8
## 129          6.4         2.8          5.6         2.1
## 130          7.2         3.0          5.8         1.6
## 131          7.4         2.8          6.1         1.9
## 132          7.9         3.8          6.4         2.0
## 133          6.4         2.8          5.6         2.2
## 134          6.3         2.8          5.1         1.5
## 135          6.1         2.6          5.6         1.4
## 136          7.7         3.0          6.1         2.3
## 137          6.3         3.4          5.6         2.4
## 138          6.4         3.1          5.5         1.8
## 139          6.0         3.0          4.8         1.8
## 140          6.9         3.1          5.4         2.1
## 141          6.7         3.1          5.6         2.4
## 142          6.9         3.1          5.1         2.3
## 143          5.8         2.7          5.1         1.9
## 144          6.8         3.2          5.9         2.3
## 145          6.7         3.3          5.7         2.5
## 146          6.7         3.0          5.2         2.3
## 147          6.3         2.5          5.0         1.9
## 148          6.5         3.0          5.2         2.0
## 149          6.2         3.4          5.4         2.3
## 150          5.9         3.0          5.1         1.8
cor(test_vi) # pairwise Pearson correlations within virginica
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000   0.4572278    0.8642247   0.2811077
## Sepal.Width     0.4572278   1.0000000    0.4010446   0.5377280
## Petal.Length    0.8642247   0.4010446    1.0000000   0.3221082
## Petal.Width     0.2811077   0.5377280    0.3221082   1.0000000
plot(test_vi) # scatter-plot matrix

corrplot(cor(test_vi)) # graphical correlation matrix

# Sepal length and petal length are highly correlated here (r = 0.86).

# What does it look like with all species pooled together?
cor(iris[, 1:4])
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000  -0.1175698    0.8717538   0.8179411
## Sepal.Width    -0.1175698   1.0000000   -0.4284401  -0.3661259
## Petal.Length    0.8717538  -0.4284401    1.0000000   0.9628654
## Petal.Width     0.8179411  -0.3661259    0.9628654   1.0000000
corrplot(cor(iris[, 1:4])) # note Sepal.Width turns negatively correlated when pooled

# Whew..
# Regression analysis: model the relationship between two variables. linear model = lm
testvi_lm <- lm(Sepal.Length ~ Petal.Length, data = test_vi)
testvi_lm
## 
## Call:
## lm(formula = Sepal.Length ~ Petal.Length, data = test_vi)
## 
## Coefficients:
##  (Intercept)  Petal.Length  
##       1.0597        0.9957
summary(testvi_lm) # inspect the fitted regression model
## 
## Call:
## lm(formula = Sepal.Length ~ Petal.Length, data = test_vi)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.73409 -0.23643 -0.03132  0.23771  0.76207 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.05966    0.46677    2.27   0.0277 *  
## Petal.Length  0.99574    0.08367   11.90  6.3e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3232 on 48 degrees of freedom
## Multiple R-squared:  0.7469, Adjusted R-squared:  0.7416 
## F-statistic: 141.6 on 1 and 48 DF,  p-value: 6.298e-16
# p-value < 0.05: significant at the 95% confidence level; reject the null hypothesis.
# R-squared: the closer the coefficient of determination is to 1, the better the fit.
names(testvi_lm) # components stored inside the fitted lm object
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
# Scatter of the fitted relationship. The model above is
# Sepal.Length ~ Petal.Length, so the predictor (Petal.Length) belongs on the
# x-axis and the response (Sepal.Length) on the y-axis — the original call had
# the axes swapped relative to the formula.
plot(test_vi$Petal.Length, test_vi$Sepal.Length)
abline(testvi_lm)  # overlay the fitted regression line

# Decision tree
# Classify Species from the other four columns with rpart.
rpart_model <- rpart(Species ~ ., data = iris)
rpart_model
## n= 150 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.45 50   0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 100  50 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Width< 1.75 54   5 versicolor (0.00000000 0.90740741 0.09259259) *
##     7) Petal.Width>=1.75 46   1 virginica (0.00000000 0.02173913 0.97826087) *
# Visualize the fitted tree
rpart.plot(rpart_model)

# Scatter plots of every pair of the four numeric columns, colored by species.
# The six hand-written plot() calls were identical up to the column pair, so
# they are collapsed into one loop over combn() pairs; combn() enumerates the
# pairs in the same order the originals appeared:
# (Sepal.Length, Sepal.Width), (Sepal.Length, Petal.Length), ...,
# (Petal.Length, Petal.Width).
for (pair in combn(names(iris)[1:4], 2, simplify = FALSE)) {
  x_var <- pair[1]
  y_var <- pair[2]
  plot(iris[[x_var]], iris[[y_var]],
       xlab = x_var, ylab = y_var,
       main = paste(x_var, "vs", y_var, "Scatter Plot"),
       col = iris$Species)
}

# Scatter plot of sepal length vs sepal width, colored by species (ggplot2).
sepal_scatter <- ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_point() +
  labs(title = "Sepal.Length vs. Sepal.Width",
       x = "Sepal.Length",
       y = "Sepal.Width",
       color = "Species")
print(sepal_scatter)  # explicit print; equivalent to top-level auto-printing

# Scatter plot of petal length vs petal width, colored by species.
# Fixed a copy-paste bug: the original aes() still used Sepal.Length for x and
# the labs() title/axis text still said "Sepal.Width", contradicting both the
# announced Petal.Length-vs-Petal.Width plot and the mapped y variable.
ggplot(iris, aes(x = Petal.Length, y = Petal.Width, color = Species)) +
  geom_point() +
  labs(title = "Petal.Length vs. Petal.Width",
       x = "Petal.Length",
       y = "Petal.Width",
       color = "Species")

# Scatter-plot matrix of the four numeric columns at once, colored by species.
measure_cols <- iris[, 1:4]
pairs(measure_cols, col = iris$Species, pch = 19)

# Shorten the long column names: sl/sw/pl/pw for the measurements, s = Species.
names(iris) <- c("sl", "sw", "pl", "pw", "s")

# Train/test split: 80% train, 20% test, reproducible via the fixed seed.
set.seed(42)
# sample(n, k) draws k values from 1:n — same RNG stream as sample(1:n, k),
# but avoids the 1:nrow() anti-pattern.
trainIndex <- sample(nrow(iris), nrow(iris) * 0.8)
trainSet <- iris[trainIndex, ]
testSet <- iris[-trainIndex, ]
# Random forest: fit on the training set, then predict the held-out test set.
# Dropped the `type = "class"` argument from the fit call: `type` belongs to
# predict.randomForest(), not randomForest(), where it was silently ignored.
model_rf <- randomForest(s ~ ., data = trainSet)
rf.pred <- predict(model_rf, testSet)
# Accuracy = share of test rows whose predicted species matches the truth.
rf.accuracy <- sum(rf.pred == testSet$s) / length(testSet$s)
print(paste("Random Forest Accuracy: ", rf.accuracy))
## [1] "Random Forest Accuracy:  0.933333333333333"
# Recode the Species factor to 0-based numeric labels (0/1/2) for xgboost.
trainSet$s <- as.numeric(trainSet$s) - 1
testSet$s <- as.numeric(testSet$s) - 1

# Split predictors (as numeric matrices) from the label vectors.
trainSet_x <- as.matrix(trainSet[, -5])
testSet_x <- as.matrix(testSet[, -5])
trainSet_y <- trainSet[, 5]
testSet_y <- testSet[, 5]

# xgboost hyper-parameters: 3-class softmax probabilities scored by
# multiclass log-loss.
params <- list(objective = "multi:softprob",
               eval_metric = "mlogloss",
               num_class = 3)

# Train xgboost for 100 boosting rounds, then predict the test set.
xgb.model <- xgboost(data = trainSet_x, label = trainSet_y, params = params, nrounds = 100)
## [1]  train-mlogloss:0.734324 
## [2]  train-mlogloss:0.522261 
## [3]  train-mlogloss:0.385193 
## [4]  train-mlogloss:0.291406 
## [5]  train-mlogloss:0.225320 
## [6]  train-mlogloss:0.177849 
## [7]  train-mlogloss:0.143049 
## [8]  train-mlogloss:0.117298 
## [9]  train-mlogloss:0.098051 
## [10] train-mlogloss:0.082183 
## [11] train-mlogloss:0.069841 
## [12] train-mlogloss:0.060162 
## [13] train-mlogloss:0.051834 
## [14] train-mlogloss:0.046324 
## [15] train-mlogloss:0.041908 
## [16] train-mlogloss:0.038578 
## [17] train-mlogloss:0.035833 
## [18] train-mlogloss:0.033422 
## [19] train-mlogloss:0.031589 
## [20] train-mlogloss:0.030162 
## [21] train-mlogloss:0.029318 
## [22] train-mlogloss:0.028246 
## [23] train-mlogloss:0.027818 
## [24] train-mlogloss:0.027204 
## [25] train-mlogloss:0.026556 
## [26] train-mlogloss:0.026178 
## [27] train-mlogloss:0.025589 
## [28] train-mlogloss:0.025013 
## [29] train-mlogloss:0.024462 
## [30] train-mlogloss:0.023902 
## [31] train-mlogloss:0.023641 
## [32] train-mlogloss:0.023310 
## [33] train-mlogloss:0.022994 
## [34] train-mlogloss:0.022758 
## [35] train-mlogloss:0.022280 
## [36] train-mlogloss:0.022048 
## [37] train-mlogloss:0.021611 
## [38] train-mlogloss:0.021394 
## [39] train-mlogloss:0.021126 
## [40] train-mlogloss:0.020916 
## [41] train-mlogloss:0.020662 
## [42] train-mlogloss:0.020475 
## [43] train-mlogloss:0.020233 
## [44] train-mlogloss:0.020016 
## [45] train-mlogloss:0.019841 
## [46] train-mlogloss:0.019675 
## [47] train-mlogloss:0.019498 
## [48] train-mlogloss:0.019332 
## [49] train-mlogloss:0.019175 
## [50] train-mlogloss:0.019011 
## [51] train-mlogloss:0.018858 
## [52] train-mlogloss:0.018720 
## [53] train-mlogloss:0.018567 
## [54] train-mlogloss:0.018425 
## [55] train-mlogloss:0.018289 
## [56] train-mlogloss:0.018146 
## [57] train-mlogloss:0.018019 
## [58] train-mlogloss:0.017889 
## [59] train-mlogloss:0.017760 
## [60] train-mlogloss:0.017644 
## [61] train-mlogloss:0.017516 
## [62] train-mlogloss:0.017397 
## [63] train-mlogloss:0.017284 
## [64] train-mlogloss:0.017163 
## [65] train-mlogloss:0.017054 
## [66] train-mlogloss:0.016951 
## [67] train-mlogloss:0.016845 
## [68] train-mlogloss:0.016750 
## [69] train-mlogloss:0.016654 
## [70] train-mlogloss:0.016562 
## [71] train-mlogloss:0.016475 
## [72] train-mlogloss:0.016384 
## [73] train-mlogloss:0.016302 
## [74] train-mlogloss:0.016219 
## [75] train-mlogloss:0.016138 
## [76] train-mlogloss:0.016063 
## [77] train-mlogloss:0.015985 
## [78] train-mlogloss:0.015910 
## [79] train-mlogloss:0.015839 
## [80] train-mlogloss:0.015768 
## [81] train-mlogloss:0.015697 
## [82] train-mlogloss:0.015628 
## [83] train-mlogloss:0.015559 
## [84] train-mlogloss:0.015491 
## [85] train-mlogloss:0.015423 
## [86] train-mlogloss:0.015360 
## [87] train-mlogloss:0.015304 
## [88] train-mlogloss:0.015252 
## [89] train-mlogloss:0.015203 
## [90] train-mlogloss:0.015159 
## [91] train-mlogloss:0.015087 
## [92] train-mlogloss:0.015044 
## [93] train-mlogloss:0.015004 
## [94] train-mlogloss:0.014936 
## [95] train-mlogloss:0.014895 
## [96] train-mlogloss:0.014858 
## [97] train-mlogloss:0.014826 
## [98] train-mlogloss:0.014795 
## [99] train-mlogloss:0.014765 
## [100]    train-mlogloss:0.014695
xgb.pred <- predict(xgb.model, testSet_x) # softprob: flat vector of per-class probabilities
xgb.pred <- matrix(xgb.pred, ncol = 3, byrow = TRUE) # one row per test case, one column per class
xgb.pred.labels <- max.col(xgb.pred) - 1 # most probable class, shifted back to 0-based labels
xgb.accuracy <- sum(testSet_y == xgb.pred.labels) / length(testSet_y)
print(paste("XGBoost Accuracy: ", xgb.accuracy))
## [1] "XGBoost Accuracy:  0.966666666666667"
# Print the confusion matrix (caret) for the xgboost predictions.
cm <- confusionMatrix(as.factor(xgb.pred.labels), as.factor(testSet_y))
print(cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1  2
##          0  9  0  0
##          1  0 10  0
##          2  0  1 10
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9667          
##                  95% CI : (0.8278, 0.9992)
##     No Information Rate : 0.3667          
##     P-Value [Acc > NIR] : 4.476e-12       
##                                           
##                   Kappa : 0.9499          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2
## Sensitivity               1.0   0.9091   1.0000
## Specificity               1.0   1.0000   0.9500
## Pos Pred Value            1.0   1.0000   0.9091
## Neg Pred Value            1.0   0.9500   1.0000
## Prevalence                0.3   0.3667   0.3333
## Detection Rate            0.3   0.3333   0.3333
## Detection Prevalence      0.3   0.3333   0.3667
## Balanced Accuracy         1.0   0.9545   0.9750
# Visualize xgboost variable importance.
importance_matrix <- xgb.importance(model = xgb.model)
xgb.plot.importance(importance_matrix)

# Compare the two models' accuracies.
print(paste("정확도 차이(랜덤포레스트-xgboost): ", abs(rf.accuracy - xgb.accuracy)))
## [1] "정확도 차이(랜덤포레스트-xgboost):  0.0333333333333333"